Read prepared data.
subscriptions <- read_rds('../data/subscriptions.rds')
summary(subscriptions)
customerid subscriptionid periodend revenuecurr revenuecurrinclvat
Min. : 10006 Min. : 154 Min. :2004-03-15 Min. : 0.8 Min. : 1
1st Qu.: 1109200 1st Qu.: 5195644 1st Qu.:2013-04-10 1st Qu.: 15.0 1st Qu.: 15
Median : 5404300 Median :15011583 Median :2015-04-03 Median : 63.2 Median : 79
Mean : 8160520 Mean :14758473 Mean :2014-09-14 Mean : 1371.7 Mean : 1393
3rd Qu.:14665703 3rd Qu.:24306942 3rd Qu.:2016-11-19 3rd Qu.: 174.4 3rd Qu.: 218
Max. :24113207 Max. :29648411 Max. :2020-03-27 Max. :2823000.0 Max. :2823000
billingcurrency startmonth endmonth isthreetoonesubs months
DKK :669182 Min. :2003-12-01 Min. :2004-03-01 Min. :0.00000 Min. : 1.000
EUR :538177 1st Qu.:2013-01-01 1st Qu.:2013-04-01 1st Qu.:0.00000 1st Qu.: 1.000
USD :513522 Median :2014-12-01 Median :2015-04-01 Median :0.00000 Median : 3.000
NOK :232949 Mean :2014-05-16 Mean :2014-08-31 Mean :0.03175 Mean : 3.527
SEK :159662 3rd Qu.:2016-08-01 3rd Qu.:2016-11-01 3rd Qu.:0.00000 3rd Qu.: 3.000
GBP : 98063 Max. :2018-03-01 Max. :2020-03-01 Max. :1.00000 Max. :24.000
(Other): 89456
status num_previous_months num_previous_subs num_previous_months_binned firstpaiddate
active:1859624 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. :2003-12-15
churn : 441387 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.:2011-01-01
Median : 8.00 Median : 3.00 Median : 8.00 Median :2013-07-01
Mean : 15.33 Mean : 6.02 Mean :14.69 Mean :2013-01-30
3rd Qu.: 22.00 3rd Qu.: 8.00 3rd Qu.:26.00 3rd Qu.:2015-09-10
Max. :162.00 Max. :69.00 Max. :39.00 Max. :2018-03-28
channelcat paymentperiodchosenatstart currency marketname siteverkey
paid :1533098 Min. :-1.000 DKK :669320 DK :669659 US :916164
viral: 767913 1st Qu.: 3.000 USD :551762 NO :233280 DK :637648
Median : 3.000 EUR :542911 US :192217 NO :207389
Mean : 4.242 NOK :232831 SE :160491 SE :144376
3rd Qu.: 3.000 SEK :159469 FR :147157 FR : 96237
Max. :24.000 GBP : 95683 (Other):897812 NL : 80165
(Other): 49035 NA's : 395 (Other):219032
firstpaidmonth firstdevice segment isquickpurchase productversion
Min. :2003-12-01 desktop: 186075 business: 417088 Min. :0.0000 v_3 : 904354
1st Qu.:2011-01-01 mobile : 75406 other : 99772 1st Qu.:0.0000 v_4 : 152485
Median :2013-07-01 NA's :2039530 personal: 300111 Median :1.0000 v_older:1243898
Mean :2013-01-16 NA's :1484040 Mean :0.5291 NA's : 274
3rd Qu.:2015-09-01 3rd Qu.:1.0000
Max. :2018-03-01 Max. :1.0000
NA's :274
isfreemium model31224 threetoonestartdate market_category
Min. :0.0000 pre-changes :1904931 Min. :2017-02-14 DK :669659
1st Qu.:0.0000 3-12-24m-2016-08-25 : 150403 1st Qu.:2017-03-14 LowGeneric:269691
Median :0.0000 3-12m-v1-2015-11-30 : 146501 Median :2017-04-18 NO :233280
Mean :0.2602 12m-v2-2015-05-16 : 72411 Mean :2017-05-25 US :192217
3rd Qu.:1.0000 ex-subscriptionplanpaywallexpand: 8512 3rd Qu.:2017-09-06 SE :160491
Max. :1.0000 ex-uglyjerry12mswitch : 5429 Max. :2017-12-19 FR :147157
NA's :274 (Other) : 12824 NA's :2144362 (Other) :628516
siteverkey_cat siteverkey_cat2 chosen_subs_length isthreetoonestate gdppercapita
ORG:1384847 MUT:1317341 1 : 56972 Min. :0.00000 Min. : 218.3
SS : 916164 ORG: 401813 -1 : 11274 1st Qu.:0.00000 1st Qu.: 42013.3
SS : 581857 12 : 52199 Median :0.00000 Median : 55670.9
24 : 7112 Mean :0.04074 Mean : 51846.7
3 : 210111 3rd Qu.:0.00000 3rd Qu.: 60637.3
gen:1963343 Max. :1.00000 Max. :108422.5
gdppercapita_scaled subscription_summary
Min. :-2.4363 mc-DK_ssc-MUT_ac-39_m-3_ccsl-gen : 109730
1st Qu.:-0.4640 mc-DK_ssc-MUT_ac-26_m-3_ccsl-gen : 77692
Median : 0.1805 mc-LowGeneric_ssc-SS_ac-0_m-1_ccsl-3: 62637
Mean : 0.0000 mc-DK_ssc-ORG_ac-0_m-3_ccsl-gen : 58395
3rd Qu.: 0.4148 mc-DK_ssc-MUT_ac-38_m-3_ccsl-gen : 52038
Max. : 2.6698 mc-DK_ssc-ORG_ac-3_m-3_ccsl-gen : 41923
(Other) :1898596
subscription_summary_no_market
ssc-MUT_ac-26_m-3_ccsl-gen: 204356
ssc-MUT_ac-39_m-3_ccsl-gen: 171697
ssc-SS_ac-0_m-1_ccsl-3 : 167341
ssc-ORG_ac-0_m-3_ccsl-gen : 132399
ssc-MUT_ac-8_m-3_ccsl-gen : 120692
ssc-MUT_ac-38_m-3_ccsl-gen: 111606
(Other) :1392920
subscriptions_with_target <- subscriptions %>%
# restrict to a recent expiry window
filter(endmonth >= begin_train_window & endmonth < end_window) %>%
mutate(num_previous_months_binned_fct = as.factor(num_previous_months_binned)) %>%
mutate(set_type = as.factor(if_else(endmonth >= begin_validation_window, 'validation', 'training'))) %>%
mutate(churnind = ifelse(status == 'churn', 1, 0))
Prepare churntable that we want to predict.
churntable <- subscriptions_with_target %>%
group_by(set_type, siteverkey_cat2, market_category, months, num_previous_months_binned, chosen_subs_length, subscription_summary_no_market) %>%
summarise(num_obs = n(),
churned = sum(churnind)) %>%
group_by(set_type) %>%
mutate(churn_rate = churned / num_obs,
renew_rate = 1 - churn_rate,
month_churn = 1 - renew_rate ^ (1/as.double(months)),
log_month_churn = log(month_churn),
weight = num_obs / sum(num_obs))
# NB! Does this introduce a bad bias ????
churntable_no_zeros <- churntable %>%
filter(churn_rate > 0)
Train model
new_model=glm(log_month_churn ~ market_category + subscription_summary_no_market, data=churntable_no_zeros[churntable_no_zeros$set_type == 'training', ], weights = weight)
Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:
prediction_table <- validation(subscriptions_with_target, new_model, predict_2fct_model)
validation_plots(prediction_table, minimal_share = 0.01)
NAs introduced by coercion
Try simple logistic model
model_logit <- glm(churnind ~ market_category + siteverkey_cat2 + num_previous_months_binned + months + chosen_subs_length,
data = subscriptions_with_target[subscriptions_with_target$set_type == 'training',], family = 'binomial')
Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:
prediction_table_logit <- validation(subscriptions_with_target, model_logit)
validation_plots(prediction_table_logit, minimal_share = 0.01)
NAs introduced by coercion